pacman::p_load(dplyr, ggplot2, readr, haven, broom, purrr, tidyr, magrittr, labelled, sjPlot, viridis, forcats, ggthemes, cluster, factoextra, fpc)
# Load the prepared CHES (Chapel Hill Expert Survey) party data set.
# load() returns the *name* of the restored object; get() then fetches it,
# so the result can be bound to `ches` regardless of the name it was saved under.
ches <- get(load("data/Rdata/ches_final.Rdata"))
# Rescale a numeric vector to [0, 1] (min-max normalisation).
# NAs are ignored when computing the range and preserved in the output.
# Returns NaN for constant input (max == min) — same as the original.
range01 <- function(x) {
  rng <- range(x, na.rm = TRUE)  # one pass instead of separate min()/max(); TRUE, not T
  (x - rng[1]) / (rng[2] - rng[1])
}
# Build the core party-level indices:
#  * populism   - anti-elite salience + corruption salience, rescaled to 0-100
#  * populism2  - anti-elite salience combined with EU opposition
#                 (1 - rescaled eu_position), rescaled to 0-100
#  * liberalism - social lifestyle + civil liberties/law & order + GAL-TAN,
#                 rescaled to 0-100
ches <- ches %>%
mutate(populism = antielite_salience + corrupt_salience) %>%
mutate(populism2 = range01(range01(antielite_salience) +
(1 - range01(eu_position)))*100) %>% #+
# (1 - range01(eu_budgets)))*100) %>%
mutate(liberalism = sociallifestyle + civlib_laworder + galtan) %>%
# Rescale the two summed indices onto a common 0-100 range.
mutate(populism = range01(populism)*100) %>%
mutate(liberalism = range01(liberalism)*100) #%>%
#filter(year > 2009)
# Scatterplot of liberalism vs. populism, coloured by EU position,
# with a quadratic OLS trend line.
ggplot(ches, aes(x = liberalism, y = populism, colour = eu_position)) +
  geom_point() +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2)) +
  #geom_text_repel(aes(liberalism, populism, label = party_cntry)) +
  viridis::scale_color_viridis() +
  ggthemes::theme_hc()
# Fixed seed so the k-means/bootstrap results below are reproducible.
set.seed(2018)

# Keep the party identifiers next to the two clustering dimensions and
# drop every row where either dimension is missing.
ches_cluster_data <- ches %>%
  select(party_name, vote_id, liberalism, populism2) %>%
  filter(!is.na(liberalism), !is.na(populism2)) %>%
  as.data.frame()
# Standardise both dimensions (z-scores) for distance-based clustering.
# The original used purrr::map_df(scale); scale() returns a matrix, so each
# resulting column was a 1-column matrix, which is why downstream code needed
# as.vector() workarounds. as.numeric() flattens to plain numeric vectors.
ches_cluster <- ches_cluster_data %>%
  select(-party_name, -vote_id) %>%
  purrr::map_df(~ as.numeric(scale(.x)))
# Pairwise (default: Euclidean) distances between parties, shown as a
# heat map — similar pairs in blue, dissimilar pairs in red.
distance <- get_dist(ches_cluster)
fviz_dist(
  distance,
  gradient = list(low = "#00AFBB", mid = "white", high = "#FC4E07")
)
# kmeans() returns a list with, among other components:
#   cluster      - integer vector (1:k): cluster each point is allocated to
#   centers      - matrix of cluster centres (cluster means)
#   totss        - total sum of squares, sum((x_i - x_bar)^2);
#                  measures the total variance in the data
#   withinss     - within-cluster sum of squares, one value per cluster
#   tot.withinss - total within-cluster sum of squares, sum(withinss)
#   betweenss    - between-cluster sum of squares, totss - tot.withinss
#   size         - number of observations in each cluster
k3 <- kmeans(ches_cluster, centers = 3, nstart = 25, iter.max = 10)
# Plot the k = 3 solution including the cluster centres; the extra `text`
# argument is forwarded by factoextra to label the observations.
ggf <- fviz_cluster(k3, data = ches_cluster, show.clust.cent = TRUE, text = "vote_id")
ggf + theme_gdocs()
# k = 3 solution with Euclid concentration ellipses. In the original paste,
# `star.plot = TRUE` and `repel = TRUE` had been swallowed into the trailing
# comment on the ellipse.type line; restored as real arguments.
fviz_cluster(
  k3,
  data = ches_cluster,
  palette = c("#2E9FDF", "#00AFBB", "#E7B800"),
  ellipse.type = "euclid", # concentration ellipse
  star.plot = TRUE,        # add segments from centroids to items
  repel = TRUE,            # avoid label overplotting (slow)
  ggtheme = theme_minimal()
)
# Choosing k for k-means via three heuristics: elbow (WSS), average
# silhouette width, and the gap statistic.
# Elbow method (dashed line marks the suggested k = 4)
fviz_nbclust(ches_cluster, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) +
labs(subtitle = "Elbow method") +
theme_gdocs()
# Silhouette method
fviz_nbclust(ches_cluster, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method") +
theme_gdocs()
# Gap statistic
# nboot = 50 to keep the function speedy.
# recommended value: nboot= 500 for your analysis.
# Use verbose = FALSE to hide computing progression.
fviz_nbclust(ches_cluster, kmeans, nstart = 25, method = "gap_stat", nboot = 50) +
labs(subtitle = "Gap statistic method") +
theme_gdocs()
# Console output from fviz_nbclust(..., method = "gap_stat"):
#   Clustering k = 1,2,..., K.max (= 10): .. done
#   Bootstrapping, b = 1,2,..., B (= 50) [one "." per sample]:
#   .................................................. 50
# According to these observations, k = 4 can be taken as the optimal number
# of clusters in the data.
# With only 2 dimensions, no PCA is needed before plotting the clusters.
library(purrr)
# Fit k-means for k = 2..8 and keep the full model objects.
res <- purrr::map(2:8, ~ kmeans(ches_cluster, .))
library(ggfortify)
# Grid of the k = 2..8 solutions.
# NOTE(review): this passes a *list* of kmeans objects to autoplot(), relying
# on ggfortify's list handling — confirm it renders one panel per solution.
autoplot(res, data = ches_cluster, ncol = 3) + theme(legend.position = "none")
# The same solutions as plain ggplot scatterplots:
# One column of k-means cluster assignments per k (named k2..k8), bound to
# the standardised clustering data.
k_cluster_dat <- 2:8 %>%
purrr::map(~ kmeans(ches_cluster, .x)$cluster) %>%
reduce(cbind) %>%
as_tibble() %>%
set_names(paste0("k", 2:8)) %>%
cbind(ches_cluster, .)
# Long format: one facet per k, points coloured by cluster id.
k_cluster_dat %>%
gather("k", "value", -liberalism, -populism2) %>%
ggplot(aes(liberalism, populism2, colour = as.factor(value))) +
geom_point() +
facet_wrap(~k) +
scale_colour_viridis(discrete = T, direction = -1) +
theme_hc() +
theme(legend.position = "none")
# Cluster profiles for the k = 3 solution: per-variable mean (_m) and sd (_s).
cbind(ches_cluster, cluster = k3$cluster) %>%
group_by(cluster) %>%
summarise_all(.funs = list(m = mean, s = sd))
# The most common k-medoids clustering method is the PAM algorithm
# (Partitioning Around Medoids; Kaufman & Rousseeuw, 1990).
# PAM (k-medoids) with k = 3 on the already-standardised data;
# stand = FALSE avoids scaling a second time.
res_pam <- pam(ches_cluster, 3, metric = "euclidean", stand = FALSE)
# Medoid-cluster assignment for each observation.
res_pam$clustering
# Console output — PAM cluster assignment per observation (n = 268):
#   [1] 1 1 1 1 2 2 2 2 2 1 3 1 3 2 1 2 1 2 1 3 1 1 2 2 2 1 1 2 3 3 1 1 2 2 1 3 3
#  [38] 3 1 1 3 1 2 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 1 2 3 3 2 2 2 2 1 1 2 2 2 1 1 1
#  [75] 1 1 3 2 3 2 1 1 2 3 3 2 2 2 2 1 2 1 1 3 1 2 3 1 1 2 1 1 1 1 1 3 1 2 2 2 1
# [112] 3 2 2 3 1 1 3 3 1 2 2 1 3 1 1 2 1 1 1 2 2 2 1 3 1 1 2 2 2 3 3 2 2 1 3 3 2
# [149] 2 2 3 2 1 2 1 3 3 2 2 2 1 1 1 2 3 3 1 1 1 3 2 2 3 2 3 3 2 2 2 3 3 2 1 3 1
# [186] 2 3 2 1 3 3 3 2 2 2 2 2 2 3 2 2 2 2 2 2 3 1 2 3 3 2 2 1 2 2 2 2 1 1 1 2 1
# [223] 2 1 1 1 3 3 1 3 1 3 1 3 1 2 3 2 1 3 3 1 1 3 1 2 2 1 1 2 3 3 2 2 1 2 2 1 1
# [260] 1 3 1 2 2 1 2 1 1
# Silhouette-based choice of k for PAM.
fviz_nbclust(ches_cluster, pam, method = "silhouette")+
theme_hc()
# PAM assignments for k = 2..8, one column per k (same layout as the
# k-means version above).
km_cluster_dat <- 2:8 %>%
purrr::map(~ pam(ches_cluster, k = ., metric = "euclidean", stand = FALSE)$clustering) %>%
reduce(cbind) %>%
as_tibble() %>%
set_names(paste0("k", 2:8)) %>%
cbind(ches_cluster, .)
# Faceted scatterplots of the PAM solutions, one panel per k.
gg_km <- km_cluster_dat %>%
gather("k", "value", -liberalism, -populism2) %>%
ggplot(aes(liberalism, populism2, colour = as.factor(value))) +
geom_point() +
facet_wrap(~k) +
scale_colour_viridis(discrete = T, direction = -1) +
theme_hc() +
theme(legend.position = "none")
gg_km
# The medoids: actual observations chosen as cluster centres.
res_pam$medoids
# Console output — medoid coordinates (standardised):
#       liberalism  populism2
# [1,] -0.9858867 -0.2065320
# [2,]  0.2614908 -0.7920694
# [3,]  1.3112089  1.2857607
# PAM cluster profiles: per-cluster medians of both dimensions.
cbind(ches_cluster, cluster = res_pam$clustering) %>%
group_by(cluster) %>%
summarise_all(.funs = list(m = median))
# Silhouette-based choice of k for CLARA (sampling-based PAM for larger data).
fviz_nbclust(ches_cluster, clara, method = "silhouette")+
theme_classic()
# Agglomerative hierarchical clustering: Euclidean distances, Ward's method.
res.dist <- dist(ches_cluster, method = "euclidean")
res.hc <- hclust(d = res.dist, method = "ward.D2")
fviz_dend(res.hc, cex = 0.5)
# Cut tree into 3 groups
grp <- cutree(res.hc, k = 3)
# Dendrogram with the tree cut into three coloured groups.
fviz_dend(
res.hc,
k = 3, # cut into three groups
cex = 0.5, # label size
k_colors = c("#2E9FDF", "#00AFBB", "#E7B800"),
color_labels_by_k = TRUE, # color labels by groups
rect = TRUE # Add rectangle around groups
)
# Same cut, drawn as a circular dendrogram.
fviz_dend(
res.hc,
cex = 1,
k = 3,
k_colors = "jco",
type = "circular"
)
# library() errors immediately if igraph is missing, whereas require() only
# returns FALSE — preferable here since the phylogenic dendrogram layouts
# below depend on it.
library("igraph")
# Console output (translated from German):
#   Loading required package: igraph
#   Attaching package: 'igraph'
#   The following object is masked from 'package:clValid': clusters
#   The following object is masked from 'package:tidyr': crossing
#   The following objects are masked from 'package:purrr': compose, simplify
#   The following objects are masked from 'package:dplyr': as_data_frame, groups, union
#   The following objects are masked from 'package:stats': decompose, spectrum
#   The following object is masked from 'package:base': union
# Phylogenic (unrooted tree) dendrograms of the 3-group cut, trying out
# several igraph layout algorithms.
# NOTE(review): "layout.gem" and "layout.mds" are legacy dot-style igraph
# names; layout_with_gem / layout_with_mds are the current equivalents —
# confirm they still resolve in the installed igraph version.
ggrep <- fviz_dend(res.hc, k = 3, k_colors = "jco",
type = "phylogenic", repel = TRUE)
ggrep
fviz_dend(res.hc, k = 3, # cut into three groups
k_colors = "jco",
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout_with_drl")
fviz_dend(res.hc, k = 3, # cut into three groups
k_colors = "jco",
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout_as_tree")
fviz_dend(res.hc, k = 3, # cut into three groups
k_colors = "jco",
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.gem")
fviz_dend(res.hc, k = 3, # cut into three groups
k_colors = "jco",
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout.mds")
gg10 <- fviz_dend(res.hc, k = 3, # cut into three groups
k_colors = "jco",
type = "phylogenic",
repel = TRUE,
phylo_layout = "layout_with_lgl")
gg10
library(clValid)
# Internal cluster validation (connectivity, Dunn index, silhouette) for
# hierarchical, k-means and PAM solutions with k = 2..8 on the CHES data.
# (The stale iris example comment from the clValid vignette was removed.)
clmethods <- c("hierarchical","kmeans","pam")
intern <- clValid(
ches_cluster %>% as.matrix,
nClust = 2:8,
clMethods = clmethods,
validation = "internal"
)
# Console message: rownames for data not specified, using 1:nrow(data)
# Tabulate the validation measures and the optimal score per measure.
summary(intern)
# Console output from summary(intern):
#   Clustering Methods:
#    hierarchical kmeans pam
#   Cluster sizes:
#    2 3 4 5 6 7 8
#   Validation Measures:
#                                 2       3       4       5       6       7       8
#   hierarchical Connectivity 10.2683 17.5187 24.7937 35.0802 42.4873 45.0024 54.0444
#                Dunn          0.0489  0.0569  0.0586  0.0678  0.0830  0.0886  0.0939
#                Silhouette    0.4421  0.3938  0.4172  0.3977  0.3853  0.3710  0.3428
#   kmeans       Connectivity 27.6107 28.8794 42.6972 48.6603 59.4067 59.1024 80.4679
#                Dunn          0.0336  0.0253  0.0327  0.0478  0.0201  0.0357  0.0419
#                Silhouette    0.4495  0.4394  0.4634  0.4296  0.3995  0.4060  0.3596
#   pam          Connectivity 26.5952 39.1357 39.8861 38.1683 67.1135 73.0849 78.7849
#                Dunn          0.0357  0.0331  0.0306  0.0334  0.0264  0.0243  0.0254
#                Silhouette    0.4486  0.4179  0.4614  0.4267  0.4014  0.4006  0.3771
#   Optimal Scores:
#                Score   Method       Clusters
#   Connectivity 10.2683 hierarchical 2
#   Dunn          0.0939 hierarchical 8
#   Silhouette    0.4634 kmeans       4
# Stability measures
# Stability-based validation of the same three methods for k = 2..6
# (how much assignments change under column removal; see ?clValid).
clmethods <- c("hierarchical","kmeans","pam")
stab <- clValid(
ches_cluster %>% as.matrix,
nClust = 2:6,
clMethods = clmethods,
validation = "stability"
) # Display only optimal Scores
optimalScores(stab)
# Model-based clustering: the model parameters are estimated with the
# Expectation-Maximization (EM) algorithm, initialised by hierarchical
# model-based clustering. Each cluster k is centred at its mean mu_k, with
# density increasing toward the mean.
library(mclust)
# Gaussian-mixture model-based clustering; Mclust() selects both the
# covariance structure and the number of components via BIC.
mc <- Mclust(ches_cluster) # Model-based-clustering
# (Mclust "fitting ..." text progress bar — 0% to 100% — omitted; console residue.)
# Chosen covariance model, number of components, log-likelihood, BIC/ICL.
summary(mc)
# Console output from summary(mc):
#   ----------------------------------------------------
#   Gaussian finite mixture model fitted by EM algorithm
#   ----------------------------------------------------
#   Mclust EII (spherical, equal volume) model with 4 components:
#    log.likelihood   n df       BIC       ICL
#         -655.0623 268 12 -1377.216 -1431.308
#   Clustering table:
#    1  2  3  4
#   91 88 53 36
# BIC values used for choosing the number of clusters
fviz_mclust(mc, "BIC", palette = "jco")
# Classification: plot showing the clustering
fviz_mclust(mc, "classification", geom = "point",
pointsize = 1.5, palette = "jco") # Classification uncertainty
fviz_mclust(mc, "uncertainty", palette = "jco")
# NOTE(review): mc was fitted on ches_cluster (rows with missing
# liberalism/populism2 dropped), but this plot maps mc$classification onto
# the full `ches` data — if `ches` still contains such rows the vector
# lengths differ. Verify, or plot from ches_cluster_data instead.
ches %>%
ggplot(aes(liberalism, populism, colour = mc$classification)) +
geom_point() +
#geom_smooth(method = "lm", formula = y ~ poly(x, 2))+
#geom_text_repel(aes(liberalism, populism, label = party_cntry)) +
ggthemes::theme_hc() +
viridis::scale_color_viridis() +
geom_density2d(alpha = .7, color = "gray") # Add 2D density
library(highcharter)
# Interactive scatter of the German parties only (vote label containing
# "DE_"), coloured by their k = 3 k-means cluster. Note: the standardised
# columns are 1-column matrices here, hence the as.vector() calls.
df1 <- tibble(vote = paste(ches_cluster_data$party_name, ches_cluster_data$vote_id, sep = " "), cluster = k3$cluster, x = as.vector(ches_cluster$liberalism), y = as.vector(ches_cluster$populism2)) %>%
filter(stringr::str_detect(vote, "DE_"))
# Custom JS tooltip showing the party label and coordinates; xy zoom enabled.
hchart(df1, hcaes(x = x, y = y, name = vote, color = cluster), type = 'scatter') %>%
hc_add_theme(hc_theme_smpl()) %>%
hc_tooltip(
formatter = JS("function(){
return ('Party: <strong>' + this.point.vote + '</strong><br> X: ' + this.x + ' <br> Y: ' + this.y + ' <br>')
}")) %>%
hc_chart(zoomType = "xy")
# NOTE(review): stray token `ggmap` plus RStudio notebook snapshot errors
# (German locale) — console residue, not code:
#   Warning in gzfile(file, "wb"):
#     cannot open compressed file '/Users/simonroth/Dropbox/projects/pol_efficacy/.Rproj.user/shared/notebooks/CDBFB3FD-A2_ches_clustering/1/473BEA9DF7ECD836/c78nbmwozwwyp_t/524d23442b1d4dbeae748ccb41943d92.snapshot',
#     reason possibly 'No such file or directory'
#   Error in gzfile(file, "wb"): cannot open connection
#   Error in (function (which = dev.cur()):
#     QuartzBitmap_Output - unable to open file '/Users/simonroth/Dropbox/projects/pol_efficacy/.Rproj.user/shared/notebooks/CDBFB3FD-A2_ches_clustering/1/473BEA9DF7ECD836/c78nbmwozwwyp_t/_rs_chunk_plot_001.png'
#ess_clean <- ess_sub %>%
# mutate(eu_member =
# recode_factor(cntry,
# DE = 1958, BE = 1958, FR = 1958, NL = 1958, IE = 1973,
# GB = 1973, FI = 1995, AT = 1995, SE = 1995, EE = 2004,
# PL = 2004, SI = 2004, CZ = 2004, CH = 0, IL = 0,
# IS = 0, NO = 0, RU = 0
# )
# ) %>%
# mutate(post_com = ifelse(region %in% c("Estonia", "Poland", "Slovenia", "Czech Republic", "Russian Federation"), "Post C", "West"))
# Bar charts of the socio-demographic variables (gender, education, income,
# religiosity, year), one free-scaled facet per variable.
# NOTE(review): `step5` is defined in another chunk/file — this block does
# not run standalone in this script.
p5 <- step5 %>%
select(gndr, edu, income, rel, year) %>%
gather("var", "value") %>%
ggplot(aes(value, fill = var)) +
geom_bar() +
facet_wrap(~var, scales = "free") +
viridis::scale_fill_viridis(discrete = T)
p5